# --- Library imports and notebook environment setup ---
import numpy as np # library to handle data in a vectorized manner
import pandas as pd # library for data analysis
# Show all columns/rows when displaying dataframes (handy for notebook inspection)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
import json # library to handle JSON files
!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
import requests # library to handle HTTP requests
# NOTE(review): pandas.io.json.json_normalize is deprecated in newer pandas
# (pd.json_normalize is used later in this notebook); this import appears
# unused and may raise on pandas >= 2.0 — confirm.
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors
# import k-means from clustering stage
from sklearn.cluster import KMeans
#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library
#While scraping the wikipedia page an error occured where the absence of lxml was indicated. This statement imports it. Upon installation, Kernel restart was necessary. /
#conda install -c anaconda lxml
print('Libraries imported.')
# NOTE(review): this was a raw shell command pasted into a Python cell and is a
# SyntaxError as written. Install lxml from a shell, or run
# `!conda install -c anaconda lxml --yes` in a notebook cell, before using
# pd.read_html below.
#conda install -c anaconda lxml
The site at the URL below is scraped to obtain the neighborhood names corresponding to each zip code.
# Scrape the LA Almanac zip-code table into a zip-code -> community lookup.
# A browser-like User-Agent header is sent, presumably so the site does not
# reject the default python-requests client.
# (Removed a duplicate `import requests`; it is already imported at the top.)
myurl = 'http://www.laalmanac.com/communications/cm02a90001-90899.php'
header = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.75 Safari/537.36",
    "X-Requested-With": "XMLHttpRequest"
}
r = requests.get(myurl, headers=header)
r.raise_for_status()  # fail loudly on an HTTP error instead of parsing an error page
df1 = pd.read_html(r.text)
LA_zip_community_lookup = df1[0]
# Normalize both column names in a single rename call (was two separate inplace renames).
LA_zip_community_lookup.rename(
    columns={'Zip Code': 'ZIP_CODE', 'Cities/Communities': 'NEIGHBORHOOD'},
    inplace=True,
)
LA_zip_community_lookup.head()
The csv file obtained from the link mentioned in the report contains a list of all active businesses in the LA area. This data is being read here for analysis.
# Load the "Listing of Active Businesses" CSV (downloaded from the data source
# referenced in the report) into a dataframe for analysis.
dfs = pd.read_csv('listing-of-active-businesses.csv')
dfs.head()
The imported data set is being cleaned up by discarding columns that are not useful for this analysis, removing rows with partial/no information.
# Keep only the columns needed for this analysis; mailing info, NAICS codes,
# dates, and the various district/boundary geographies are dropped.
unneeded_columns = [
    'LOCATION ACCOUNT #', 'DBA NAME', 'LOCATION DESCRIPTION', 'MAILING ADDRESS',
    'MAILING ZIP CODE', 'MAILING CITY', 'NAICS', 'COUNCIL DISTRICT',
    'LOCATION START DATE', 'LOCATION END DATE', 'Zip Codes', 'Council Districts',
    'Census Tracts', 'Precinct Boundaries', 'LA Specific Plans',
    'Neighborhood Councils (Certified)',
]
losangeles_businesses = dfs.drop(unneeded_columns, axis=1)
losangeles_businesses.head()
len(losangeles_businesses)
# Discard rows with any missing values and renumber the index.
losangeles_businesses.dropna(inplace=True)
losangeles_businesses.reset_index(drop=True, inplace=True)
len(losangeles_businesses)
losangeles_businesses.head()
# Reduce ZIP+4 codes (e.g. "90001-1234") to the plain 5-digit ZIP.
losangeles_businesses["ZIP CODE"] = losangeles_businesses["ZIP CODE"].str.split("-", n=1).str[0]
losangeles_businesses.head()
Some string manipulation operations are being performed here so that the malformed JSON in the LOCATION column can be interpreted correctly as LATITUDE and LONGITUDE
# The LOCATION column holds a malformed JSON-ish string such as
# "{'latitude': '33.99', 'longitude': '-118.25', 'human_address': ...}".
# Strip the quote/comma/brace noise so the two coordinates can be pulled out
# positionally. (Removed `temp_long1`, which recomputed the same split as
# `temp_lat1` and was never used.)
losangeles_businesses["LOCATION"] = losangeles_businesses["LOCATION"].str.replace("'", "")
losangeles_businesses["LOCATION"] = losangeles_businesses["LOCATION"].str.replace(",", "")
losangeles_businesses["LOCATION"] = losangeles_businesses["LOCATION"].str.replace("}", "")
# Everything after the first space looks like: "<lat> longitude: <lng> human_address ..."
remainder = losangeles_businesses["LOCATION"].str.split(" ", n=1, expand=True)[1]
# Latitude is the first token of the remainder.
latitude_col = remainder.str.split(" ", n=1, expand=True)[0]
# Longitude sits between the next ":" and the "human_address" key.
longitude_col = remainder.str.split(":", n=1, expand=True)[1]
longitude_col = longitude_col.str.replace(" ", "")
longitude_col = longitude_col.str.split("human_address", n=1, expand=True)[0]
losangeles_businesses["LATITUDE"] = latitude_col
losangeles_businesses["LONGITUDE"] = longitude_col
losangeles_businesses.drop(columns=["LOCATION"], inplace=True)
losangeles_businesses.head()
len(losangeles_businesses)
The dataset contains a large amount of data, which slows down processing. Hence, for the purposes of this analysis, only locations within the city of LA are considered; the neighboring suburbs are not studied here and are thus discarded.
# Restrict the analysis to businesses whose CITY field mentions Los Angeles;
# surrounding suburbs are excluded.
la_mask = losangeles_businesses['CITY'].str.contains("LOS ANGELES", case=False, na=False)
losangeles_businesses_only = losangeles_businesses[la_mask].reset_index(drop=True)
losangeles_businesses_only.rename(columns={'ZIP CODE': 'ZIP_CODE'}, inplace=True)
len(losangeles_businesses_only)
losangeles_businesses_only.head()
# Coordinates were parsed out of a string column; cast them to floats.
losangeles_businesses_only['LATITUDE'] = losangeles_businesses_only['LATITUDE'].astype(float)
losangeles_businesses_only['LONGITUDE'] = losangeles_businesses_only['LONGITUDE'].astype(float)
A master dataset is created here with neighborhood names included by merging the neighborhoods dataset along with the LA active businesses dataset
# Build the master dataset: attach neighborhood names to each business by
# joining the businesses table with the zip-code lookup on ZIP_CODE.
losangeles_businesses_only.replace('', np.nan, inplace=True)
losangeles_businesses_only.dropna(inplace=True)
# Both sides of the join must share the same ZIP_CODE dtype.
for zip_df in (losangeles_businesses_only, LA_zip_community_lookup):
    zip_df['ZIP_CODE'] = zip_df['ZIP_CODE'].astype(int)
la_businesses_master = pd.merge(
    losangeles_businesses_only,
    LA_zip_community_lookup[['NEIGHBORHOOD', 'ZIP_CODE']],
    on='ZIP_CODE',
)
la_businesses_master.head()
The foursquare API is used here to obtain the competing Cuban restaurants
# --- Foursquare venue search: find competing Cuban restaurants ---
search_query = 'Cuban'
radius = 25000  # search radius around the city center, in meters
print(search_query + ' .... OK!')
# Geocode the city center to anchor the venue search.
address = 'Los Angeles, CA'
geolocator = Nominatim(user_agent="ca_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geographical coordinates of Los Angeles are {}, {}.'.format(latitude, longitude))
# SECURITY(review): API credentials are hardcoded and printed below. Move them
# to environment variables / a config file and rotate these keys before sharing
# this notebook.
CLIENT_ID = 'JBY0YANCCPVYDZJRGUC4PKUJXPXRSCB52IZYBIN3VV4BH3OQ' # your Foursquare ID
CLIENT_SECRET = 'MV4MQUCRVYAIOPQTHU2EGDWRV4SSLTQPHAZUR5LQOC5C1QWX' # your Foursquare Secret
VERSION = '20180604'  # Foursquare API version date
LIMIT = 75  # maximum number of venues returned
print('Your credentials:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)
url = 'https://api.foursquare.com/v2/venues/search?client_id={}&client_secret={}&ll={},{}&v={}&query={}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET, latitude, longitude, VERSION, search_query, radius, LIMIT)
url
results = requests.get(url).json()
#results
# pull the venue list out of the JSON response
venues = results['response']['venues']
# flatten the nested venue records into a dataframe
dataframe = pd.json_normalize(venues)
dataframe.head()
Cleaning up the JSON data obtained from foursquare
# Restrict to the venue name, its categories, every 'location.*' field, and the id.
location_cols = [col for col in dataframe.columns if col.startswith('location.')]
filtered_columns = ['name', 'categories'] + location_cols + ['id']
dataframe_filtered = dataframe.loc[:, filtered_columns]
# function that extracts the category of the venue
def get_category_type(row):
    """Return the primary category name for a Foursquare venue row.

    Looks up the 'categories' field (falling back to 'venue.categories' for
    rows coming from an explore-style response) and returns the first
    category's name, or None when the venue has no categories.
    """
    try:
        categories_list = row['categories']
    except KeyError:  # was a bare except; only a missing key should fall through
        categories_list = row['venue.categories']
    # truthiness also handles None, not just an empty list
    if not categories_list:
        return None
    return categories_list[0]['name']
# Replace the raw category dicts with a single category name per row.
dataframe_filtered['categories'] = dataframe_filtered.apply(get_category_type, axis=1)
# Keep only the last dotted component of each column name (location.lat -> lat).
dataframe_filtered.columns = [col.split('.')[-1] for col in dataframe_filtered.columns]
dataframe_filtered.head()
Since the search keyword used on Foursquare was 'Cuban', the results contained Smoke Shops and Convention centers as well. Hence the data is being filtered for places that just serve food
# Drop venues with no category, then keep only food-serving places
# (restaurants, food, bars, cafes) — the 'Cuban' keyword also matched
# smoke shops, convention centers, etc.
has_category = dataframe_filtered['categories'].notna()
dataframe_filtered = dataframe_filtered[has_category]
food_mask = dataframe_filtered['categories'].str.contains('restaurant|food|bar|caf', case=False)
df_cleaned = dataframe_filtered[food_mask]
cuban_competitors = df_cleaned.reset_index(drop=True)
cuban_competitors = cuban_competitors[['name', 'lat', 'lng']]
cuban_competitors
All active LA businesses are being visualized in blue on a city map of LA
# Plot every active LA business as a small blue circle on a folium map.
map_la_businesses = folium.Map(location=[latitude, longitude], zoom_start=10)
business_points = zip(
    la_businesses_master['LATITUDE'],
    la_businesses_master['LONGitude'.upper()],
    la_businesses_master['BUSINESS NAME'],
)
for biz_lat, biz_lng, biz_name in business_points:
    popup = folium.Popup('{}'.format(biz_name), parse_html=True)
    folium.CircleMarker(
        [biz_lat, biz_lng],
        radius=2,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.8,
        parse_html=False).add_to(map_la_businesses)
map_la_businesses
A flag is added to check if each location is a competitor or not
# Initialize the competitor flag; it is set to True later for businesses
# within a fixed radius of a Cuban restaurant.
la_businesses_master['close_to_competitor'] = False
In this part, businesses within a 3 km radius of each Cuban restaurant are flagged as 'close to competitor'.
from geopy import distance

# Flag every business within `radius` km of any competing Cuban restaurant.
# Fixes vs. original: the competitor tuple and `radius` were rebuilt on every
# inner iteration (loop-invariant work), two throwaway single-entry dicts were
# created per pair, and two loop variables (street address, business name)
# were unpacked but never used.
# NOTE: still an O(competitors x businesses) geodesic scan; fine at this data
# size, but a spatial index would be needed for much larger inputs.
radius = 3  # in kilometers
for clat, clng, clabel in zip(cuban_competitors['lat'], cuban_competitors['lng'], cuban_competitors['name']):
    center_point_tuple = (clat, clng)  # hoisted: invariant across the inner loop
    for la_index, la_lat, la_lng in zip(la_businesses_master.index,
                                        la_businesses_master['LATITUDE'],
                                        la_businesses_master['LONGITUDE']):
        dis = distance.distance(center_point_tuple, (la_lat, la_lng)).km
        if dis < radius:
            la_businesses_master.at[la_index, 'close_to_competitor'] = True
print('Completed!')
la_businesses_master.head()
Locations flagged as close to a competitor are filtered out.
# Inspect the flag distribution, then drop the businesses that sit close to a
# competing Cuban restaurant.
la_businesses_master.groupby('close_to_competitor').count()
la_businesses_master = la_businesses_master[~la_businesses_master['close_to_competitor']]
la_businesses_master.shape
Estimating number of businesses per neighborhood
# Business counts per neighborhood (rough proxy for local economic activity).
la_businesses_master.groupby('NEIGHBORHOOD').count()
# Number of distinct business categories (NAICS descriptions) in the data.
print('There are {} uniques categories.'.format(len(la_businesses_master['PRIMARY NAICS DESCRIPTION'].unique())))
Identifying the top 10 popular businesses per neighborhood, to get an idea of the economic background of the neighborhood
# One-hot encode the business category, then average per neighborhood to get
# the relative frequency of each category in each neighborhood.
la_onehot = pd.get_dummies(la_businesses_master[['PRIMARY NAICS DESCRIPTION']], prefix="", prefix_sep="")
la_onehot['Neighborhood'] = la_businesses_master['NEIGHBORHOOD']
# Reorder so the Neighborhood column comes first.
reordered = [la_onehot.columns[-1], *la_onehot.columns[:-1]]
la_onehot = la_onehot[reordered]
la_onehot.head()
la_onehot.shape
la_grouped = la_onehot.groupby('Neighborhood').mean().reset_index()
la_grouped.head()
la_grouped.shape
# Per-neighborhood top-category report. All the print statements are commented
# out, so this loop currently produces no visible output (kept for debugging).
# NOTE(review): num_top_venues is reassigned to 10 in a later cell.
num_top_venues = 5
for hood in la_grouped['Neighborhood']:
    #print("----"+hood+"----")
    temp = la_grouped[la_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]  # drop the Neighborhood row, keep only category frequencies
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    #print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    #print('\n')
def return_most_common_venues(row, num_top_venues):
    """Return the names of the `num_top_venues` highest-frequency categories.

    `row` is a neighborhood row from la_grouped: its first entry is the
    neighborhood name and the remaining entries are category frequencies.
    """
    frequencies = row.iloc[1:]
    ranked = frequencies.sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]
# Build a table with one row per neighborhood listing its top-N categories.
num_top_venues = 10
indicators = ['st', 'nd', 'rd']  # ordinal suffixes for 1st/2nd/3rd

# Create the column headers: 'Neighborhood', '1st Most Common Venue', ...
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind + 1, indicators[ind]))
    except IndexError:  # was a bare except; only ranks past 3rd fall back to 'th'
        columns.append('{}th Most Common Venue'.format(ind + 1))

# Fill one row per neighborhood with its ranked category names.
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = la_grouped['Neighborhood']
for ind in np.arange(la_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(la_grouped.iloc[ind, :], num_top_venues)
neighborhoods_venues_sorted.head()
Visualizing the active businesses on the map of LA, with businesses close to competitors removed.
# Re-plot the remaining businesses (competitor-adjacent ones removed).
# Fix: corrected the typo in the printed message ('geograpical coordinate').
address = 'Los Angeles, CA'
geolocator = Nominatim(user_agent="ca_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geographical coordinates of Los Angeles are {}, {}.'.format(latitude, longitude))
map_la_businesses = folium.Map(location=[latitude, longitude], zoom_start=10)
# add a blue circle marker for every remaining business
for lat, lng, name in zip(la_businesses_master['LATITUDE'], la_businesses_master['LONGITUDE'], la_businesses_master['BUSINESS NAME']):
    label = folium.Popup('{}'.format(name), parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=2,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.5,
        parse_html=False).add_to(map_la_businesses)
map_la_businesses
In this map, blue markers indicate active businesses, and red markers indicate the competing Cuban restaurants. Understandably, the Cuban restaurants are in the center of the 'no-business' circle.
# Overlay the competing Cuban restaurants in red on the same map; they sit at
# the centers of the business-free circles created by the 3 km filter.
# Fix: corrected the typo in the printed message ('geograpical coordinate').
address = 'Los Angeles, CA'
geolocator = Nominatim(user_agent="ca_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geographical coordinates of Los Angeles are {}, {}.'.format(latitude, longitude))
#cuban_businesses = folium.Map(location=[latitude, longitude], zoom_start=10)
# add a red circle marker for each competitor
for lat, lng, name in zip(cuban_competitors['lat'], cuban_competitors['lng'], cuban_competitors['name']):
    label = folium.Popup('{}'.format(name), parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=2,
        popup=label,
        color='red',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.5,
        parse_html=False).add_to(map_la_businesses)
map_la_businesses
la_businesses_master.head()
Location-based clustering, to identify candidate neighborhood clusters to operate the truck from.
# set number of clusters
kclusters = 4
# Cluster the businesses purely on their geographic coordinates.
la_grouped_clustering = la_businesses_master[['LATITUDE', 'LONGITUDE']]
# run k-means clustering
kmeans = KMeans(init="k-means++", n_clusters=kclusters, n_init=22).fit(la_grouped_clustering)
# peek at the first ten cluster assignments
kmeans.labels_[:10]
la_grouped_clustering.head()
# Attach each business's cluster label to the master dataframe.
# FIX(review): in the original, every line of this step was commented out, so
# 'Cluster Labels' was never created and `la_merged` was never defined — the
# final `la_merged.head()` and all subsequent cells (the cluster map and the
# per-cluster counts) would raise NameError/KeyError. Reconstructed the
# minimal working version below.
la_businesses_master.insert(0, 'Cluster Labels', kmeans.labels_)
# The cluster map below iterates la_merged's LATITUDE/LONGITUDE/'Cluster
# Labels'; the master dataframe now carries all three.
la_merged = la_businesses_master
la_businesses_master.head()
la_merged.head()
Map showing all the location clusters.
# Visualize the clusters, one rainbow color per cluster.
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)
# derive one hex color per cluster from the rainbow colormap
x = np.arange(kclusters)
ys = [i + x + (i * x) ** 2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(rgb) for rgb in colors_array]
# draw one marker per business, colored by its cluster assignment
markers_colors = []
cluster_points = zip(la_merged['LATITUDE'], la_merged['LONGITUDE'], la_merged['Cluster Labels'])
for point_lat, point_lon, cluster in cluster_points:
    label = folium.Popup(' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [point_lat, point_lon],
        radius=5,
        popup=label,
        color=rainbow[cluster - 1],
        fill=True,
        fill_color=rainbow[cluster - 1],
        fill_opacity=0.7).add_to(map_clusters)
map_clusters
The count of businesses in each cluster can be seen below. Since one of these clusters has far fewer businesses, those neighborhoods are the lowest priority for selling Cuban food.
# Row/column counts per cluster. The column selection keeps column 1 plus
# columns 5 onward (drops the cluster label and the address/ZIP fields).
(la_businesses_master.loc[la_businesses_master['Cluster Labels'] == 0, la_businesses_master.columns[[1] + list(range(5, la_businesses_master.shape[1]))]]).shape
(la_businesses_master.loc[la_businesses_master['Cluster Labels'] == 1, la_businesses_master.columns[[1] + list(range(5, la_businesses_master.shape[1]))]]).shape
(la_businesses_master.loc[la_businesses_master['Cluster Labels'] == 2, la_businesses_master.columns[[1] + list(range(5, la_businesses_master.shape[1]))]]).shape
(la_businesses_master.loc[la_businesses_master['Cluster Labels'] == 3, la_businesses_master.columns[[1] + list(range(5, la_businesses_master.shape[1]))]]).shape
The neighborhoods below are arranged in descending order within each cluster, displaying the neighborhoods with the most businesses first. Thus, the ideal locations for selling Cuban food have now been determined.
# Per-cluster ranking: neighborhoods sorted by business count (descending),
# using BUSINESS NAME's non-null count as the row counter.
(la_businesses_master.loc[la_businesses_master['Cluster Labels'] == 0, la_businesses_master.columns[[1] + list(range(5, la_businesses_master.shape[1]))]]).groupby('NEIGHBORHOOD').count().sort_values('BUSINESS NAME',ascending=False)
(la_businesses_master.loc[la_businesses_master['Cluster Labels'] == 1, la_businesses_master.columns[[1] + list(range(5, la_businesses_master.shape[1]))]]).groupby('NEIGHBORHOOD').count().sort_values('BUSINESS NAME',ascending=False)
(la_businesses_master.loc[la_businesses_master['Cluster Labels'] == 2, la_businesses_master.columns[[1] + list(range(5, la_businesses_master.shape[1]))]]).groupby('NEIGHBORHOOD').count().sort_values('BUSINESS NAME',ascending=False)
(la_businesses_master.loc[la_businesses_master['Cluster Labels'] == 3, la_businesses_master.columns[[1] + list(range(5, la_businesses_master.shape[1]))]]).groupby('NEIGHBORHOOD').count().sort_values('BUSINESS NAME',ascending=False)